org 100h        ; assume al=0 bx=0 sp=di=-2 si=0100h bp=09??h; last 16 bytes of PSP = 0
C16 dw 16
C100 dw 0xC4
RO equ $-1-4*3    ; dd 0.0, 0.0, -128.0
;00000100  1000              adc [bx+si],al
;00000102  C400              les ax,[bx+si]

;Video mode + palette: 4 bits orange * 4 bits blue. Uses default index 0 (black).
  mov al,13h
P:int 10h       ; set video mode | set palette index: bx=i dh=R ch=G cl=B
  dec di        ; di = pixel address = -3

;Each frame:
M:mov dx,0xA000-10-20-20-4;  ; visible pixels are A0000..AF9FF: want X=0 Y=0 in the center of the screen
  mov es,dx     ; dx:bx=YX:XX = es:0    (dx and bx are neighbors after PUSHA)

;Generate N gem normals at [bp+200h,300h]: 0 0 1 d=16, +-1 +-1 +-1 d=32
BIG equ $-1
  pusha
  mov cx,[si]
PL:
  add bp,si
  pusha
  mov bl,4

  fld1
  inc cx
  jp GNC
  fchs
GNC:fild word[di-3-16]
  fsincos

RR:
  dec bx
  dec bx
  call STORE                   ;|y x z         ;|x Y Z
  fild word[di-3]              ;|T
  fidiv word[bx+si]            ;|T/100         ;|T/16
                               ;|Y,Z=R(y,z)    ;|XX,ZZ=r(x,Z)
ROTATE:  ; { angle } a[bp] --> { Rx Rz }
  fsincos            ;| c s
ROTATE_CS:  ; { c s } a[bp] --> { Rx Rz }
  call LOAD_SCALE_XZ ;| sx c sz
  call LOAD_SCALE_XZ ;| cx sx cz sz
  fsubp st3,st0      ;| sx cz sz-cx
  faddp              ;| sx+cz sz-cx
  fld dword[bp+si+4]           ;|x Y Z         ;|Y XX ZZ
  jnz RR
  call NORMALIZE_STORE

  popa
  loop PL
  popa

;Each pixel:
X:   ;cx=T di=adr_pixel(init=0) bp=09?? si=0100 ah=0   ; cf=0
  inc dx
X2:
  fninit        ; adr:     -18 -16 -14 -12 -10  -8  -6  -4  -2
  pusha         ; stack:    di  si  bp  sp  bx  dx  cx  ax   0
  xor bx,bx     ; s16:  pixadr 100 9??  -2  ..X..Y  T  result

;Compute ray direction.
  fild word[byte BIG + si-100h] ; Z=27000
  fild word[bx-9]  ; X   |rD.xyz p.d
  fild word[bx-8]  ; Y
  call NORMALIZE_STORE ;bx=bp

;Hit the gem.
GEM_OUTER: ; ro[dx] rd[bp] --> cf=1_if_hit di=address_of_hit_facet   ; clobbers ax,bx,cx
  fild dword[si]  ;|tfront=0 tback=huge
  fldz
  mov cx,[si]
  lea bx,[bp+si]  ; bx = current gem; gem normals are at [bp+200h,300h,...]
G:
  fild word[byte C100 + si-100h] ; planes have distance 100

;ray-plane intersection
  call DOT        ;|D=pn*rd pd tf tb
  mov ax,RO-0x100
  xchg bp,ax
  call DOT        ;|pn*ro D pd tf tb
  xchg bp,ax
  fsubp st2,st0   ;|D N=pd-pn*ro
  ftst
  fnstsw ax
  sahf       ; cf=1 if we're in front of the plane
  fdivp st1,st0   ;|t=N/D tf tb
  jnc GBACK
GFRONT:
  fcom st0,st1
  fnstsw ax
  sahf
  jbe GNEXT         ;if t>tf { tf=t; di=hit_address = current; }
  fst st1
  mov di,bx
  jmp GNEXT
GBACK:
  fcom st0,st2
  fnstsw ax
  sahf
  jae GNEXT        ;if t<tf { tb=t; }
  fst st2
GNEXT:
  fstp st0
  fcom
  fnstsw ax
  sahf              ;if tf>=fb { no_hit: cf=0; early exit } else { cf=1 }
  jae GEXIT
  lea bx,[bx+si] ; don't set flags
  loop G
GEXIT:              ;i=adr_facet, bp=rd
  fld  dword[bp+si]
  jnc SKY
  mov  bx,di
  call DOT          ;|i*n    ;i[bx] n[bp]
  fmul dword[bx+si] ; ky
  fsubr dword[bp+si]
SKY:
  fmul st0         ;|rd.y^2 and skip gamma correction (so actually ^4)
  fimul word[si]
  fistp word[-4] ; pushed ax
  popa

; 4-bit builtin gray palette with cheapo (6$) dithering. (The multiplier must be 239, not 255.)
  add al,16
  stosb
  add bx,0xCCCD ;dx:bx = YXX += 0000CCCD
  jnc X2
  jnz X   ;do 65536 iterations

  inc cx  ; T++
  in al,60h
  dec ax
  jnz M   ; fallthrough

NORMALIZE_STORE:  ; { a.x .y .z } --> { n.x .y .z } a[bp](unnormalized) bx=bp
  call STORE        ;|a*a   ; [bp]=a (unnormalized)
  mov bx,bp
  call DOT          ;|a*a   ; [bp]=a (unnormalized)
  fsqrt
  fld1
  fdivrp st1        ;|rsqrt(a*a)        ...    will be: |rd.x rd.y rd.z
  call LOAD_SCALE
STORE: ; { a.x .y .z } --> a[bp]
  fstp dword[bp+si]
  fstp dword[bp+si+4]
  fstp dword[bp+si+8]
  ret
  
LOAD_SCALE: ; { k } a[bp] --> { k*x k*y k*z }
  fld dword[bp+si+4]
  fmul st1           ;|ky k
LOAD_SCALE_XZ:
  fld dword[bp+si+8]
  fmul st2
  fxch st2           ;|k ky kz
  fmul dword[bp+si]  ;|kx ky kz
  ret

DOT:  ; a[bp] b[bx] --> { (a dot b) }
  fld dword[bp+si]
  fmul dword[bx+si]
  fld dword[bp+si+4]
  fmul dword[bx+si+4]
  faddp
  fld dword[bp+si+8]
  fmul dword[bx+si+8]
  faddp
  ret
